library(car)
## Loading required package: carData
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(TSstudio)
library(ggplot2)
library(tibble)
library(caret)
## Loading required package: lattice
## Load packages
# Data Prep and EDA
library(knitr)
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
library(corrplot)
## corrplot 0.92 loaded
# Logistic Reg. and Model Selection
library(caTools)
library(car)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-3
library(caret)
# # KNN
# library(kknn)
# # Decision Tree and Random Forest
# library(rpart)
# library(rpart.plot)
# library(randomForest)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Read the pre-split training and test sets from disk
train3 <- read.csv("train3.csv")
test <- read.csv("test.csv")

# Drop the first column of each — presumably a row-index column written by
# write.csv (TODO confirm against the CSV files)
train <- train3[-1]
test <- test[-1]

# Encode the categorical training columns as factors
train$RFMSeg <- as.factor(train$RFMSeg)
train$Is_Buying_Most_Popular <- as.factor(train$Is_Buying_Most_Popular)
train$Country <- as.factor(train$Country)

# Same factor encoding for the test set; column 12 is also dropped here
# NOTE(review): what column 12 holds is not visible in this file — confirm
test$RFMSeg <- as.factor(test$RFMSeg)
test <- test[-12]

test$Is_Buying_Most_Popular <- as.factor(test$Is_Buying_Most_Popular)
test$Country <- as.factor(test$Country)

# Final summary of the prepared training data
summary(train)
##  Orders_Unique    Returns_Unique    Total_Items_Purchased Quantity_Basket  
##  Min.   : 0.000   Min.   : 0.0000   Min.   :    0.0       Min.   :   0.00  
##  1st Qu.: 1.000   1st Qu.: 0.0000   1st Qu.:  146.0       1st Qu.:  91.29  
##  Median : 2.000   Median : 0.0000   Median :  326.0       Median : 158.00  
##  Mean   : 3.083   Mean   : 0.7027   Mean   :  829.2       Mean   : 230.20  
##  3rd Qu.: 3.000   3rd Qu.: 1.0000   3rd Qu.:  728.5       3rd Qu.: 265.10  
##  Max.   :51.000   Max.   :15.0000   Max.   :78758.0       Max.   :4582.71  
##  Total_Items_Returned Types_Items_Purchased Unique_Item_Per_Basket
##  Min.   :-1296.00     Min.   :  0.00        Min.   :  0.000       
##  1st Qu.:   -3.50     1st Qu.: 15.00        1st Qu.:  9.333       
##  Median :    0.00     Median : 32.00        Median : 17.000       
##  Mean   :  -13.32     Mean   : 46.78        Mean   : 21.498       
##  3rd Qu.:    0.00     3rd Qu.: 61.00        3rd Qu.: 27.708       
##  Max.   :    0.00     Max.   :467.00        Max.   :148.500       
##  Types_Items_Returned Unique_Item_Per_Return Sales_Revenue     
##  Min.   : 0.000       Min.   : 0.0000        Min.   :     0.0  
##  1st Qu.: 0.000       1st Qu.: 0.0000        1st Qu.:   300.7  
##  Median : 0.000       Median : 0.0000        Median :   609.7  
##  Mean   : 1.418       Mean   : 0.8432        Mean   :  1392.1  
##  3rd Qu.: 1.000       3rd Qu.: 1.0000        3rd Qu.:  1281.8  
##  Max.   :45.000       Max.   :45.0000        Max.   :110713.1  
##  Return_Refund      Average_Unit_Price_Purchase Average_Unit_Refund_Return
##  Min.   :-3614.40   Min.   :  0.000             Min.   :  0.00            
##  1st Qu.:  -15.90   1st Qu.:  1.447             1st Qu.:  0.00            
##  Median :    0.00   Median :  1.863             Median :  0.00            
##  Mean   :  -38.21   Mean   :  2.450             Mean   :  3.59            
##  3rd Qu.:    0.00   3rd Qu.:  2.433             3rd Qu.:  2.35            
##  Max.   :    0.00   Max.   :295.000             Max.   :722.88            
##            Country     Is_Buying_Most_Popular    Recency      
##  Others        : 121   0:856                  Min.   :  0.00  
##  United Kingdom:1258   1:523                  1st Qu.: 14.00  
##                                               Median : 41.00  
##                                               Mean   : 54.16  
##                                               3rd Qu.: 82.00  
##                                               Max.   :181.00  
##     Y_Income        RFMSeg  
##  Min.   :     1.0   0:  37  
##  1st Qu.:   341.8   1:1070  
##  Median :   767.0   2: 272  
##  Mean   :  1885.3           
##  3rd Qu.:  1642.1           
##  Max.   :168820.9

Full model

# Baseline: ordinary least squares of Y_Income on every other column of train
# ("." in the formula expands to all remaining variables)
full.model <- lm(Y_Income~ ., data = train)
summary(full.model)
## 
## Call:
## lm(formula = Y_Income ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20994.5   -519.1   -170.5    366.8  30363.9 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -325.55358  404.54171  -0.805  0.42111    
## Orders_Unique               -135.49509   28.03361  -4.833 1.50e-06 ***
## Returns_Unique               -88.98896   77.15597  -1.153  0.24896    
## Total_Items_Purchased         -0.38652    0.09518  -4.061 5.17e-05 ***
## Quantity_Basket               -0.20879    0.25732  -0.811  0.41727    
## Total_Items_Returned          -0.38686    1.15234  -0.336  0.73713    
## Types_Items_Purchased         -6.05021    2.28675  -2.646  0.00824 ** 
## Unique_Item_Per_Basket         3.93614    5.07074   0.776  0.43774    
## Types_Items_Returned           2.78044   42.28039   0.066  0.94758    
## Unique_Item_Per_Return       -72.86254   52.38147  -1.391  0.16445    
## Sales_Revenue                  1.78890    0.06531  27.390  < 2e-16 ***
## Return_Refund                 -0.40498    0.59072  -0.686  0.49311    
## Average_Unit_Price_Purchase   -3.71719    6.17393  -0.602  0.54722    
## Average_Unit_Refund_Return    -4.39604    2.93526  -1.498  0.13445    
## CountryUnited Kingdom        342.48439  204.18272   1.677  0.09371 .  
## Is_Buying_Most_Popular1       -0.83823  126.44192  -0.007  0.99471    
## Recency                        0.53403    1.28981   0.414  0.67891    
## RFMSeg1                      497.31935  352.12084   1.412  0.15807    
## RFMSeg2                      519.49917  395.29515   1.314  0.18900    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2075 on 1360 degrees of freedom
## Multiple R-squared:  0.8955, Adjusted R-squared:  0.8941 
## F-statistic: 647.2 on 18 and 1360 DF,  p-value: < 2.2e-16
# Significant coefficients: terms with Pr(>|t|) < 0.05
# (column 4 of summary()$coeff is the p-value column)
which(summary(full.model)$coeff[,4]<0.05)
##         Orders_Unique Total_Items_Purchased Types_Items_Purchased 
##                     2                     4                     7 
##         Sales_Revenue 
##                    11
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant Coefficients
which(summary(full.model)$coeff[,4]>0.05)
##                 (Intercept)              Returns_Unique 
##                           1                           3 
##             Quantity_Basket        Total_Items_Returned 
##                           5                           6 
##      Unique_Item_Per_Basket        Types_Items_Returned 
##                           8                           9 
##      Unique_Item_Per_Return               Return_Refund 
##                          10                          12 
## Average_Unit_Price_Purchase  Average_Unit_Refund_Return 
##                          13                          14 
##       CountryUnited Kingdom     Is_Buying_Most_Popular1 
##                          15                          16 
##                     Recency                     RFMSeg1 
##                          17                          18 
##                     RFMSeg2 
##                          19
# Number of estimated coefficients, excluding the intercept.
# nrow() of the coefficient matrix is clearer and more robust than
# length()/4, which silently relies on the matrix having exactly 4 columns.
paste("number of coefficient:", nrow(summary(full.model)$coefficients) - 1)
## [1] "number of coefficient: 18"
# Refit the full model on sqrt(Y_Income) — a square-root transform of the
# response (the Box-Cox analysis later in this document also selects
# lambda = 0.5, i.e. the square root)
full.model.transformed <-lm(Y_Income^(1/2)~., data=train)
# Display summary
summary(full.model.transformed)
## 
## Call:
## lm(formula = Y_Income^(1/2) ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.138  -8.590  -1.696   7.189 118.906 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 14.2106116  2.9525255   4.813 1.65e-06 ***
## Orders_Unique                1.5244188  0.2046018   7.451 1.64e-13 ***
## Returns_Unique              -0.7471269  0.5631186  -1.327  0.18481    
## Total_Items_Purchased       -0.0058896  0.0006947  -8.478  < 2e-16 ***
## Quantity_Basket              0.0191830  0.0018781  10.214  < 2e-16 ***
## Total_Items_Returned        -0.0362494  0.0084103  -4.310 1.75e-05 ***
## Types_Items_Purchased       -0.0009672  0.0166897  -0.058  0.95380    
## Unique_Item_Per_Basket       0.0252595  0.0370085   0.683  0.49502    
## Types_Items_Returned         1.0104432  0.3085811   3.274  0.00109 ** 
## Unique_Item_Per_Return      -1.6161617  0.3823033  -4.227 2.52e-05 ***
## Sales_Revenue                0.0070713  0.0004767  14.834  < 2e-16 ***
## Return_Refund                0.0053779  0.0043114   1.247  0.21247    
## Average_Unit_Price_Purchase -0.0146259  0.0450601  -0.325  0.74554    
## Average_Unit_Refund_Return   0.0179782  0.0214229   0.839  0.40150    
## CountryUnited Kingdom       -0.0875648  1.4902164  -0.059  0.95315    
## Is_Buying_Most_Popular1      0.4159744  0.9228295   0.451  0.65223    
## Recency                     -0.0282769  0.0094136  -3.004  0.00271 ** 
## RFMSeg1                      5.8480868  2.5699347   2.276  0.02303 *  
## RFMSeg2                      9.7567188  2.8850400   3.382  0.00074 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.15 on 1360 degrees of freedom
## Multiple R-squared:  0.6997, Adjusted R-squared:  0.6957 
## F-statistic:   176 on 18 and 1360 DF,  p-value: < 2.2e-16
# Signiciant Coefficients
which(summary(full.model.transformed)$coeff[,4]<0.05)
##            (Intercept)          Orders_Unique  Total_Items_Purchased 
##                      1                      2                      4 
##        Quantity_Basket   Total_Items_Returned   Types_Items_Returned 
##                      5                      6                      9 
## Unique_Item_Per_Return          Sales_Revenue                Recency 
##                     10                     11                     17 
##                RFMSeg1                RFMSeg2 
##                     18                     19
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant Coefficients
which(summary(full.model.transformed)$coeff[,4]>0.05)
##              Returns_Unique       Types_Items_Purchased 
##                           3                           7 
##      Unique_Item_Per_Basket               Return_Refund 
##                           8                          12 
## Average_Unit_Price_Purchase  Average_Unit_Refund_Return 
##                          13                          14 
##       CountryUnited Kingdom     Is_Buying_Most_Popular1 
##                          15                          16
# Number of estimated coefficients, excluding the intercept.
# nrow() is direct; length()/4 depends on the matrix having 4 columns.
paste("number of coefficient:", nrow(summary(full.model.transformed)$coefficients) - 1)
## [1] "number of coefficient: 18"
  1. Forward-Backward Stepwise Regression
# Intercept-only model: the lower bound of the stepwise search space
min.model <- lm(Y_Income ~ 1, data = train)

# Forward-backward ("both" direction) stepwise selection by AIC, searching
# between the intercept-only model and the full model
search_scope <- list(lower = min.model, upper = full.model)
step.model <- step(min.model, scope = search_scope,
                   direction = "both", trace = FALSE)
summary(step.model)

Call: lm(formula = Y_Income ~ Sales_Revenue + Orders_Unique + Total_Items_Purchased + Types_Items_Purchased + Types_Items_Returned + Average_Unit_Refund_Return + Country, data = train)

Residuals: Min 1Q Median 3Q Max -21030.7 -477.5 -165.5 349.8 30513.5

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 163.83102 200.56172 0.817 0.414150
Sales_Revenue 1.80237 0.05951 30.287 < 2e-16 ***
Orders_Unique -147.28508 20.20997 -7.288 5.30e-13 ***
Total_Items_Purchased -0.40919 0.08244 -4.964 7.79e-07 ***
Types_Items_Purchased -5.03032 1.39013 -3.619 0.000307 ***
Types_Items_Returned -46.44894 17.41150 -2.668 0.007727 **
Average_Unit_Refund_Return -3.72667 1.96528 -1.896 0.058136 .
CountryUnited Kingdom 351.03456 201.63276 1.741 0.081915 .
--- Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2072 on 1371 degrees of freedom Multiple R-squared: 0.8949, Adjusted R-squared: 0.8944 F-statistic: 1668 on 7 and 1371 DF, p-value: < 2.2e-16

# Signiciant Coefficients
which(summary(step.model)$coeff[,4]<0.05)
    Sales_Revenue         Orders_Unique Total_Items_Purchased 
                2                     3                     4 

Types_Items_Purchased Types_Items_Returned 5 6

print("--------------------------------------------------------------------------------")

[1] "--------------------------------------------------------------------------------"

## Insignificant variables
# Insignificant Coefficients
which(summary(step.model)$coeff[,4]>0.05)
           (Intercept) Average_Unit_Refund_Return 
                     1                          7 
 CountryUnited Kingdom 
                     8 
s = summary(step.model)
length(s$coefficients)

[1] 32

paste("number of coefficient:", length(summary(step.model)$coefficients )/4 - 1)

[1] "number of coefficient: 7"

# Box-Cox: find the power transform of the response that best normalizes
# the residuals of the stepwise model
bc<-boxCox(step.model)

# Extract optimal lambda (grid value maximizing the profile log-likelihood)
opt.lambda<-bc$x[which.max(bc$y)]
# Rounded optimal lambda (nearest 0.5). Fix: cat() has no `end` argument
# (a Python idiom) — the named arg was silently absorbed into `...`;
# pass "\n" positionally instead (printed output is unchanged).
cat("Optimal Lambda = ", round(opt.lambda/0.5)*0.5, "\n")
## Optimal Lambda =  0.5
# Refit the stepwise-selected predictors on sqrt(Y_Income), per Box-Cox
step.model.transformed=lm(formula = Y_Income**(1/2) ~ Sales_Revenue + Orders_Unique + Total_Items_Purchased + 
    Types_Items_Purchased + Types_Items_Returned + Average_Unit_Refund_Return + 
    Country, data = train)
summary(step.model.transformed)
## 
## Call:
## lm(formula = Y_Income^(1/2) ~ Sales_Revenue + Orders_Unique + 
##     Total_Items_Purchased + Types_Items_Purchased + Types_Items_Returned + 
##     Average_Unit_Refund_Return + Country, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -92.953  -9.581  -2.212   7.527 141.597 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                23.7851972  1.5566698  15.280  < 2e-16 ***
## Sales_Revenue               0.0060357  0.0004619  13.067  < 2e-16 ***
## Orders_Unique               1.3435685  0.1568607   8.565  < 2e-16 ***
## Total_Items_Purchased      -0.0029727  0.0006398  -4.646 3.71e-06 ***
## Types_Items_Purchased       0.0312137  0.0107896   2.893  0.00388 ** 
## Types_Items_Returned        0.3591615  0.1351402   2.658  0.00796 ** 
## Average_Unit_Refund_Return -0.0034860  0.0152536  -0.229  0.81927    
## CountryUnited Kingdom      -2.3864777  1.5649827  -1.525  0.12751    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.08 on 1371 degrees of freedom
## Multiple R-squared:  0.6587, Adjusted R-squared:  0.6569 
## F-statistic: 377.9 on 7 and 1371 DF,  p-value: < 2.2e-16
# Signiciant Coefficients
which(summary(step.model.transformed)$coeff[,4]<0.05)
##           (Intercept)         Sales_Revenue         Orders_Unique 
##                     1                     2                     3 
## Total_Items_Purchased Types_Items_Purchased  Types_Items_Returned 
##                     4                     5                     6
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant Coefficients
which(summary(step.model.transformed)$coeff[,4]>0.05)
## Average_Unit_Refund_Return      CountryUnited Kingdom 
##                          7                          8
# Number of estimated coefficients, excluding the intercept.
paste("number of coefficient:", nrow(summary(step.model.transformed)$coefficients) - 1)
## [1] "number of coefficient: 7"
  1. Lasso Regression
# Set a seed for reproducibility
set.seed(1)

# Standardized design matrix (dummy-coded factors, intercept column dropped)
# and standardized response — puts lasso coefficients on a comparable scale
x.train <- scale(model.matrix(Y_Income ~ ., train)[,-1])
y.train <- scale(train$Y_Income)

# "_ws" versions: same data without scaling (presumably "without scaling" —
# TODO confirm); used later to refit plain OLS on the selected predictors
x.train_ws <- model.matrix(Y_Income ~ ., train)[,-1]
y.train_ws <- train$Y_Income

# 10-fold cross-validation to pick the lasso penalty lambda
cv.lasso <- cv.glmnet(x.train, y.train, alpha = 1, nfolds = 10)
cv.lasso$lambda.min
## [1] 0.006786743
# Fit the lasso path (alpha = 1) and show coefficients at the CV-optimal lambda
lasso.model <- glmnet(x.train, y.train, alpha = 1, nlambda = 100)
coef(lasso.model, cv.lasso$lambda.min)
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                                        s1
## (Intercept)                 -1.515894e-17
## Orders_Unique               -6.565112e-02
## Returns_Unique              -1.163078e-02
## Total_Items_Purchased        .           
## Quantity_Basket             -3.779479e-03
## Total_Items_Returned         .           
## Types_Items_Purchased       -2.668665e-02
## Unique_Item_Per_Basket       .           
## Types_Items_Returned         .           
## Unique_Item_Per_Return      -1.257738e-02
## Sales_Revenue                9.931592e-01
## Return_Refund                .           
## Average_Unit_Price_Purchase  .           
## Average_Unit_Refund_Return  -9.763884e-03
## CountryUnited Kingdom        3.745255e-03
## Is_Buying_Most_Popular1      .           
## Recency                      5.293025e-03
## RFMSeg1                      5.527952e-04
## RFMSeg2                      .
# Indices of coefficients shrunk exactly to zero at the optimal lambda
# (indexes the 19-row coefficient vector, intercept in row 1)
index.lasso <- which(coef(lasso.model, cv.lasso$lambda.min) == 0)
cat("\n\n\n Variables not selected by lasso regression: ",
    names(coef(full.model)[index.lasso]))
## 
## 
## 
##  Variables not selected by lasso regression:  Total_Items_Purchased Total_Items_Returned Unique_Item_Per_Basket Types_Items_Returned Return_Refund Average_Unit_Price_Purchase Is_Buying_Most_Popular1 RFMSeg2
# Retrain OLS using only the lasso-selected predictors.
# NOTE(review): the -1 shift maps coef rows (intercept first) onto the
# intercept-free columns of x.train_ws; this is only valid while the
# intercept coefficient is never exactly zero — confirm.
lasso.predictors <- as.data.frame(x.train_ws)[-(index.lasso-1)]
lasso.retrained <- lm(y.train_ws ~ ., data = lasso.predictors)
summary(lasso.retrained)
## 
## Call:
## lm(formula = y.train_ws ~ ., data = lasso.predictors)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -19236   -518   -196    344  32005 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 228.05282  273.66698   0.833   0.4048    
## Orders_Unique              -139.83618   25.03943  -5.585 2.82e-08 ***
## Returns_Unique              -44.07081   55.48524  -0.794   0.4272    
## Quantity_Basket              -0.45998    0.23093  -1.992   0.0466 *  
## Types_Items_Purchased        -3.93727    1.43079  -2.752   0.0060 ** 
## Unique_Item_Per_Return      -47.22059   26.67094  -1.770   0.0769 .  
## Sales_Revenue                 1.53739    0.01993  77.148  < 2e-16 ***
## Average_Unit_Refund_Return   -3.40148    2.00504  -1.696   0.0900 .  
## `CountryUnited Kingdom`     240.09303  203.61760   1.179   0.2385    
## Recency                       0.97199    1.24052   0.784   0.4334    
## RFMSeg1                      63.72478  161.33859   0.395   0.6929    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2088 on 1368 degrees of freedom
## Multiple R-squared:  0.8936, Adjusted R-squared:  0.8928 
## F-statistic:  1148 on 10 and 1368 DF,  p-value: < 2.2e-16
# Signiciant Coefficients
which(summary(lasso.retrained)$coeff[,4]<0.05)
##         Orders_Unique       Quantity_Basket Types_Items_Purchased 
##                     2                     4                     5 
##         Sales_Revenue 
##                     7
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant Coefficients
which(summary(lasso.retrained)$coeff[,4]>0.05)
##                (Intercept)             Returns_Unique 
##                          1                          3 
##     Unique_Item_Per_Return Average_Unit_Refund_Return 
##                          6                          8 
##    `CountryUnited Kingdom`                    Recency 
##                          9                         10 
##                    RFMSeg1 
##                         11
# Number of estimated coefficients, excluding the intercept.
paste("number of coefficient:", nrow(summary(lasso.retrained)$coefficients) - 1)
## [1] "number of coefficient: 10"
# Plot the regression coefficient path.

set.seed(1)

# Refit of the same lasso path as lasso.model above (identical settings)
lassomodel = glmnet(x.train, y.train, alpha=1, nlambda=100)

## Coefficient paths vs log(lambda), with a dashed line at the CV-optimal lambda
plot(lassomodel, xvar="lambda", label=TRUE, lwd=2)
abline(v=log(cv.lasso$lambda.min), col='black', lty=2, lwd=2)

# Box-Cox transformation on the lasso-retrained OLS model
bc<-boxCox(lasso.retrained)

# Extract optimal lambda (grid value maximizing the profile log-likelihood)
opt.lambda<-bc$x[which.max(bc$y)]
# Rounded optimal lambda. Fix: cat() has no `end` argument (a Python idiom);
# the named arg was silently absorbed into `...` — pass "\n" positionally
# instead (printed output is unchanged).
cat("Optimal Lambda = ", round(opt.lambda/0.5)*0.5, "\n")
## Optimal Lambda =  0.5
# Refit on sqrt of the response, per the Box-Cox result (lambda = 0.5)
lasso.retrained.transformed <- lm(y.train_ws**(1/2) ~ ., data = lasso.predictors)
summary(lasso.retrained.transformed)
## 
## Call:
## lm(formula = y.train_ws^(1/2) ~ ., data = lasso.predictors)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74.74  -8.82  -1.79   7.09 135.74 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                23.474078   2.059502  11.398  < 2e-16 ***
## Orders_Unique               1.561800   0.188436   8.288 2.71e-16 ***
## Returns_Unique              1.100590   0.417558   2.636  0.00849 ** 
## Quantity_Basket             0.015342   0.001738   8.828  < 2e-16 ***
## Types_Items_Purchased       0.018765   0.010767   1.743  0.08161 .  
## Unique_Item_Per_Return     -0.157339   0.200714  -0.784  0.43324    
## Sales_Revenue               0.003314   0.000150  22.099  < 2e-16 ***
## Average_Unit_Refund_Return -0.015773   0.015089  -1.045  0.29605    
## `CountryUnited Kingdom`    -1.525716   1.532340  -0.996  0.31958    
## Recency                    -0.021147   0.009336  -2.265  0.02366 *  
## RFMSeg1                    -2.209740   1.214166  -1.820  0.06898 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.71 on 1368 degrees of freedom
## Multiple R-squared:  0.6749, Adjusted R-squared:  0.6725 
## F-statistic:   284 on 10 and 1368 DF,  p-value: < 2.2e-16
# Signiciant Coefficients
which(summary(lasso.retrained.transformed)$coeff[,4]<0.05)
##     (Intercept)   Orders_Unique  Returns_Unique Quantity_Basket   Sales_Revenue 
##               1               2               3               4               7 
##         Recency 
##              10
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant Coefficients
which(summary(lasso.retrained.transformed)$coeff[,4]>0.05)
##      Types_Items_Purchased     Unique_Item_Per_Return 
##                          5                          6 
## Average_Unit_Refund_Return    `CountryUnited Kingdom` 
##                          8                          9 
##                    RFMSeg1 
##                         11
# Number of estimated coefficients, excluding the intercept.
paste("number of coefficient:", nrow(summary(lasso.retrained.transformed)$coefficients) - 1)
## [1] "number of coefficient: 10"
  1. Elastic Net Regression
# Set a seed for reproducibility
set.seed(1)

# Cross-validation for the elastic-net penalty (nfolds left at the glmnet
# default of 10, matching the lasso's explicit nfolds = 10); alpha = 0.5
# gives an equal mix of L1 (lasso) and L2 (ridge) penalties
cv.elnet <- cv.glmnet(x.train, y.train, alpha = 0.5)

# Fit the elastic-net path and display coefficients at the CV-optimal lambda
elnet.model <- glmnet(x.train, y.train, alpha = 0.5)
coef(elnet.model, cv.elnet$lambda.min)
## 19 x 1 sparse Matrix of class "dgCMatrix"
##                                        s1
## (Intercept)                 -1.472182e-17
## Orders_Unique               -6.769646e-02
## Returns_Unique              -1.254931e-02
## Total_Items_Purchased        .           
## Quantity_Basket             -6.250172e-03
## Total_Items_Returned         .           
## Types_Items_Purchased       -2.705066e-02
## Unique_Item_Per_Basket       .           
## Types_Items_Returned         .           
## Unique_Item_Per_Return      -1.605573e-02
## Sales_Revenue                9.934581e-01
## Return_Refund               -6.694274e-03
## Average_Unit_Price_Purchase  .           
## Average_Unit_Refund_Return  -1.519984e-02
## CountryUnited Kingdom        5.812796e-03
## Is_Buying_Most_Popular1      .           
## Recency                      6.381197e-03
## RFMSeg1                      1.584849e-03
## RFMSeg2                      .
cv.elnet$lambda.min
## [1] 0.008524545
# Indices of coefficients shrunk exactly to zero at the optimal lambda
index.elnet <- which(coef(elnet.model, cv.elnet$lambda.min) == 0)
cat("\n\n\n Variables not selected by elastic net regression:",
    names(coef(full.model)[index.elnet]))
## 
## 
## 
##  Variables not selected by elastic net regression: Total_Items_Purchased Total_Items_Returned Unique_Item_Per_Basket Types_Items_Returned Average_Unit_Price_Purchase Is_Buying_Most_Popular1 RFMSeg2
# Retrain OLS on the elastic-net-selected predictors.
# NOTE(review): as with the lasso, the -1 shift assumes the intercept row
# of the coefficient vector is never exactly zero — confirm.
elnet.predictors <- as.data.frame(x.train_ws)[-(index.elnet-1)]
elnet.retrained <- lm(y.train_ws ~ ., data = elnet.predictors)
summary(elnet.retrained)
## 
## Call:
## lm(formula = y.train_ws ~ ., data = elnet.predictors)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18973   -523   -198    350  32061 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 266.51592  274.41368   0.971  0.33161    
## Orders_Unique              -146.00608   25.28428  -5.775 9.54e-09 ***
## Returns_Unique              -55.83016   55.87726  -0.999  0.31790    
## Quantity_Basket              -0.47465    0.23093  -2.055  0.04003 *  
## Types_Items_Purchased        -3.76505    1.43339  -2.627  0.00872 ** 
## Unique_Item_Per_Return      -66.11525   28.87933  -2.289  0.02221 *  
## Sales_Revenue                 1.53108    0.02026  75.580  < 2e-16 ***
## Return_Refund                -0.76975    0.45302  -1.699  0.08952 .  
## Average_Unit_Refund_Return   -6.00220    2.52138  -2.381  0.01742 *  
## `CountryUnited Kingdom`     251.32954  203.58474   1.235  0.21722    
## Recency                       0.78335    1.24463   0.629  0.52920    
## RFMSeg1                      49.22705  161.45304   0.305  0.76049    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2086 on 1367 degrees of freedom
## Multiple R-squared:  0.8938, Adjusted R-squared:  0.8929 
## F-statistic:  1046 on 11 and 1367 DF,  p-value: < 2.2e-16
# Signiciant Coefficients
which(summary(elnet.retrained)$coeff[,4]<0.05)
##              Orders_Unique            Quantity_Basket 
##                          2                          4 
##      Types_Items_Purchased     Unique_Item_Per_Return 
##                          5                          6 
##              Sales_Revenue Average_Unit_Refund_Return 
##                          7                          9
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant Coefficients
which(summary(elnet.retrained)$coeff[,4]>0.05)
##             (Intercept)          Returns_Unique           Return_Refund 
##                       1                       3                       8 
## `CountryUnited Kingdom`                 Recency                 RFMSeg1 
##                      10                      11                      12
# Number of estimated coefficients, excluding the intercept.
paste("number of coefficient:", nrow(summary(elnet.retrained)$coefficients) - 1)
## [1] "number of coefficient: 11"
  1. Variable Selection Comparison
# Identify variables not selected by F-B stepwise regression: terms present
# in the full model's coefficient vector but absent from the stepwise model's
index.step <- which(!(names(coef(full.model)) %in% names(coef(step.model))))
cat("\n\n\n Variables not selected by forward-backward stepwise:",
    names(coef(full.model)[index.step]))
## 
## 
## 
##  Variables not selected by forward-backward stepwise: Returns_Unique Quantity_Basket Total_Items_Returned Unique_Item_Per_Basket Unique_Item_Per_Return Return_Refund Average_Unit_Price_Purchase Is_Buying_Most_Popular1 Recency RFMSeg1 RFMSeg2
# Identify variables not selected by Lasso
index.lasso <- which(coef(lasso.model, cv.lasso$lambda.min) == 0)
cat("\n\n\n Variables not selected by lasso regression: ",
    names(coef(full.model)[index.lasso]))
## 
## 
## 
##  Variables not selected by lasso regression:  Total_Items_Purchased Total_Items_Returned Unique_Item_Per_Basket Types_Items_Returned Return_Refund Average_Unit_Price_Purchase Is_Buying_Most_Popular1 RFMSeg2
# Identify variables not selected by Elastic Net
index.elnet <- which(coef(elnet.model, cv.elnet$lambda.min) == 0)
cat("\n\n\n Variables not selected by elastic net regression:",
    names(coef(full.model)[index.elnet]))
## 
## 
## 
##  Variables not selected by elastic net regression: Total_Items_Purchased Total_Items_Returned Unique_Item_Per_Basket Types_Items_Returned Average_Unit_Price_Purchase Is_Buying_Most_Popular1 RFMSeg2

Prediction on Test Set

Now we generate predictions on the test set using the regression models created above. For the models fitted on the square-root-transformed response, the predictions are squared to bring them back to the original Y_Income scale before the evaluation metrics are computed, so all models are compared on the same scale.

# 1. Predictions for the full model and its sqrt-transformed version

# Predicted values on the test set ("response" is the default scale for
# predict.lm); transformed-model predictions are squared back to the
# original Y_Income scale
pred.full = predict(full.model, newdata = test, type = "response")
pred.full.transformed = predict(full.model.transformed, newdata = test, type = "response")**2


# 2. Predictions for the stepwise regression models

# Predicted values on the test set
pred.step = predict(step.model, newdata = test, type = "response")
pred.step.transformed=predict(step.model.transformed, newdata = test, type = "response")**2


# 3. Predictions for the lasso-selected OLS models


# Expand test-set factors into dummy columns so names match the training
# design matrix. NOTE(review): this matrix still contains Y_Income as a
# column, but predict() looks up predictor columns by name, so it is ignored.
new_test <- model.matrix( ~ ., test)[,-1]
# Predicted values on the test set
pred.lasso = predict(lasso.retrained, newdata = as.data.frame(new_test),
                     type = "response")
pred.lasso.transformed = predict(lasso.retrained.transformed, newdata = as.data.frame(new_test),
                     type = "response")**2


# 4. Prediction for the elastic-net-selected OLS model

# Set predictors to correct format (response excluded via the formula)
x.test <- model.matrix(Y_Income ~ ., test)[,-1]
# Direct glmnet prediction kept for reference:
# pred.elnet = as.vector(predict(elnet.model, newx = x.test,
#                                type = "response", s = cv.elnet$lambda.min))
pred.elnet=predict(elnet.retrained, newdata = as.data.frame(x.test),
                     type = "response")

# Collect the observed Y_Income and all seven model predictions in one frame
preds = data.frame(Y_Income = test$Y_Income, pred.full,pred.full.transformed,
                   pred.step,pred.step.transformed, pred.lasso,pred.lasso.transformed, pred.elnet)

Evaluation Metrics

# Mean squared prediction error: average squared gap between the observed
# values and the predictions.
mspe <- function(prediction, testData) {
  mean((testData - prediction)^2)
}

mae <-function(prediction, testData) {return(mean(abs(testData - prediction)))}

mape <-function(prediction, testData) {return(mean(abs(testData - prediction)/testData))}

pm <-function(prediction, testData) {return(sum((testData - prediction)^2)/sum((testData - mean(testData))^2))}
# Build the metric comparison table: one row per model, one column per metric.
# Column 1 of `preds` holds the observed Y_Income; columns 2:8 hold the seven
# models' predictions. Replaces the original row-growing loop (which appended
# one row per iteration) with a single preallocated vapply build.
testData <- preds[, 1]
report_result <- as.data.frame(t(vapply(2:8, function(i) {
  prediction <- preds[, i]
  c(MSPE = mspe(prediction, testData),
    MAE  = mae(prediction, testData),
    MAPE = mape(prediction, testData),
    PM   = pm(prediction, testData))
}, numeric(4))))
# Row labels in the same order as the prediction columns of `preds`
rownames(report_result) <- c("Full", "Full-Transformed", "Step-Wise",
                             "Step-Wise-Transformed", "Lasso",
                             "Lasso-Transformed", "ENet")

Adding R-squared and Adjusted R-squared

  # Collect in-sample fit statistics from each fitted lm's summary, listed in
  # the same order as the rows of report_result
  fitted.models <- list(full.model, full.model.transformed,
                        step.model, step.model.transformed,
                        lasso.retrained, lasso.retrained.transformed,
                        elnet.retrained)
  report_result$R.Squared <- sapply(fitted.models,
                                    function(m) summary(m)$r.squared)
  report_result$Adj.R.Squared <- sapply(fitted.models,
                                        function(m) summary(m)$adj.r.squared)

Number of total predictors, and number of significant predictors

# Coefficient counts per model, transcribed from the model summaries above.
# Fixed: stored as integers rather than character strings so they can be used
# numerically downstream (displayed and CSV output are unchanged).
report_result$number_of_coefficients <- c(19L, 19L, 8L, 8L, 11L, 11L, 12L)
report_result$number_of_significant_coefficients <- c(4L, 11L, 3L, 6L, 4L, 6L, 6L)

Adding Residual analysis and GOF From Analysis Below

# Initialize the residual-analysis columns with "N" placeholders; each model's
# row is overwritten by the residual-analysis sections below
gof.columns <- c("OveralL_GOF", "Linearity", "ConstantVariance",
                 "Independence", "Normality")
report_result[gof.columns] <- "N"

Residual Analysis of All Models

Full Model

# Get standardized residuals of the full model
resids = rstandard(full.model)
par(mfrow=c(2,2))
# Residuals vs each predictor, skipping columns 14,15,17,18.
# NOTE(review): loop indexes train3 while the model was fit on
# train = train3[-1]; confirm the column offsets are intended.
  for (i in c(1:18)){
  col_name = names(train3[i]) 
  
  if (!(i %in% c(14,15,17,18))){
  plot(train3[,i], resids, xlab= col_name, ylab = "S. Residuals")
  abline(h=0, col="red")
  lines(lowess(train3[,i], resids), col='blue')
  }
}

# Checking for constant variance and uncorrelated errors

# Plot of std. residuals versus fitted values.
# Fixed: use full.model's fitted values -- the original plotted full.model's
# residuals against step.model's fitted values (copy-paste slip).
plot(full.model$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(full.model$fitted.values, resids), col='blue')
abline(h=0, col="red")

summary_full_model = summary(full.model)
# Plots for normality
hist(resids, col="orange", nclass=15)

qqPlot(resids)
## [1] 667 122
# Record the residual-analysis verdicts for the full model (row 1)
report_result$OveralL_GOF[1] = "not good"
report_result$Linearity[1] = "Seems to be holding"
report_result$ConstantVariance[1] = "Does not seem to be holding"
report_result$Independence[1] =  "Errors are uncorrelated"
report_result$Normality[1] = "Does not seem to be holding"

# Display the coefficient table and fit statistics of the full model
summary_full_model
## 
## Call:
## lm(formula = Y_Income ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20994.5   -519.1   -170.5    366.8  30363.9 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -325.55358  404.54171  -0.805  0.42111    
## Orders_Unique               -135.49509   28.03361  -4.833 1.50e-06 ***
## Returns_Unique               -88.98896   77.15597  -1.153  0.24896    
## Total_Items_Purchased         -0.38652    0.09518  -4.061 5.17e-05 ***
## Quantity_Basket               -0.20879    0.25732  -0.811  0.41727    
## Total_Items_Returned          -0.38686    1.15234  -0.336  0.73713    
## Types_Items_Purchased         -6.05021    2.28675  -2.646  0.00824 ** 
## Unique_Item_Per_Basket         3.93614    5.07074   0.776  0.43774    
## Types_Items_Returned           2.78044   42.28039   0.066  0.94758    
## Unique_Item_Per_Return       -72.86254   52.38147  -1.391  0.16445    
## Sales_Revenue                  1.78890    0.06531  27.390  < 2e-16 ***
## Return_Refund                 -0.40498    0.59072  -0.686  0.49311    
## Average_Unit_Price_Purchase   -3.71719    6.17393  -0.602  0.54722    
## Average_Unit_Refund_Return    -4.39604    2.93526  -1.498  0.13445    
## CountryUnited Kingdom        342.48439  204.18272   1.677  0.09371 .  
## Is_Buying_Most_Popular1       -0.83823  126.44192  -0.007  0.99471    
## Recency                        0.53403    1.28981   0.414  0.67891    
## RFMSeg1                      497.31935  352.12084   1.412  0.15807    
## RFMSeg2                      519.49917  395.29515   1.314  0.18900    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2075 on 1360 degrees of freedom
## Multiple R-squared:  0.8955, Adjusted R-squared:  0.8941 
## F-statistic: 647.2 on 18 and 1360 DF,  p-value: < 2.2e-16

Full Model Transformed

# Get standardized residuals of the transformed full model
resids = rstandard(full.model.transformed)
par(mfrow=c(2,2))
# Residuals vs each predictor, skipping columns 14,15,17,18.
# NOTE(review): loop indexes train3 while the model was fit on
# train = train3[-1]; confirm the column offsets are intended.
  for (i in c(1:18)){
  col_name = names(train3[i]) 
  
  if (!(i %in% c(14,15,17,18))){
  plot(train3[,i], resids, xlab= col_name, ylab = "S. Residuals")
  abline(h=0, col="red")
  lines(lowess(train3[,i], resids), col='blue')
  }
}

# Checking for constant variance and uncorrelated errors

# Plot of std. residuals versus fitted values.
# Fixed: use full.model.transformed's fitted values -- the original plotted
# this model's residuals against step.model's fitted values (copy-paste slip).
plot(full.model.transformed$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(full.model.transformed$fitted.values, resids), col='blue')
abline(h=0, col="red")


# Plots for normality
hist(resids, col="orange", nclass=15)

qqPlot(resids)
## [1] 122 498
# Residual-analysis verdicts for the transformed full model (row 2).
# Typo fixes in the stored strings: stray leading space removed from the
# constant-variance verdict; "moderaetly" corrected to "moderately".
report_result$OveralL_GOF[2] = "Average"
report_result$Linearity[2] = "Seems to be holding"
report_result$ConstantVariance[2] = "seem to be holding"
report_result$Independence[2] =  "Errors are uncorrelated"
report_result$Normality[2] = "Improved and moderately holding"

Stepwise Model

# Standardized residuals of the (untransformed) stepwise model
resids = rstandard(step.model)

par(mfrow = c(2, 2))

# Residual-vs-predictor plots, restricted to the variables retained by the
# stepwise selection
kept.vars <- c("Sales_Revenue", "Return_Refund", "Total_Items_Purchased",
               "Types_Items_Purchased", "Unique_Item_Per_Basket",
               "Is_Buying_Most_Popular", "Country", "Quantity_Basket",
               "Recency", "RFMSeg")
for (i in seq_len(18)) {
  col_name <- names(train[i])
  if (col_name %in% kept.vars) {
    plot(train[, i], resids, xlab = col_name, ylab = "S. Residuals")
    abline(h = 0, col = "red")
    lines(lowess(train[, i], resids), col = "blue")
  }
}

# Constant variance / uncorrelated errors: std. residuals vs fitted values
plot(step.model$fitted.values, resids, xlab = "Fitted Values", ylab = " S. Residuals")
lines(lowess(step.model$fitted.values, resids), col = "blue")
abline(h = 0, col = "red")


# Normality checks
hist(resids, col = "orange", nclass = 15)

qqPlot(resids)
## [1] 122 667
report_result$OveralL_GOF[3] = "Average"
report_result$Linearity[3] = "Seems to be holding"
report_result$ConstantVariance[3] = "not clearly holding"
report_result$Independence[3] =  "Errors are uncorrelated"
report_result$Normality[3] = "Does not seemd to be holding"

Stepwise Transformed

# Standardized residuals of the transformed stepwise model
resids = rstandard(step.model.transformed)

par(mfrow = c(2, 2))

# Residual-vs-predictor plots, restricted to the variables retained by the
# stepwise selection
kept.vars <- c("Sales_Revenue", "Return_Refund", "Total_Items_Purchased",
               "Types_Items_Purchased", "Unique_Item_Per_Basket",
               "Is_Buying_Most_Popular", "Country", "Quantity_Basket",
               "Recency", "RFMSeg")
for (i in seq_len(18)) {
  col_name <- names(train[i])
  if (col_name %in% kept.vars) {
    plot(train[, i], resids, xlab = col_name, ylab = "S. Residuals")
    abline(h = 0, col = "red")
    lines(lowess(train[, i], resids), col = "blue")
  }
}

# Constant variance / uncorrelated errors: std. residuals vs fitted values
plot(step.model.transformed$fitted.values, resids, xlab = "Fitted Values", ylab = " S. Residuals")
lines(lowess(step.model.transformed$fitted.values, resids), col = "blue")
abline(h = 0, col = "red")


# Normality checks
hist(resids, col = "orange", nclass = 15)

qqPlot(resids)
## [1] 511 667
# Residual-analysis verdicts for the transformed stepwise model (row 4):
#   Overall GOF       = Average
#   Linearity         = Seems to be holding
#   Constant variance = Seems to be holding
#   Independence      = Errors are uncorrelated
#   Normality         = Improved and moderately holding
# Typo fix in the stored string: "moderaetly" -> "moderately".
  
report_result$OveralL_GOF[4] = "Average"
report_result$Linearity[4] = "Seems to be holding"
report_result$ConstantVariance[4] = "seem to be holding"
report_result$Independence[4] =  "Errors are uncorrelated"
report_result$Normality[4] = "Improved and moderately holding"

lasso.retrained Model

# Get standardized residuals of the retrained lasso model
resids = rstandard(lasso.retrained)
par(mfrow=c(2,2))
# Residuals vs each lasso-selected predictor, skipping columns 14,15,17,18
  for (i in c(1:ncol(lasso.predictors))){
  col_name = names(lasso.predictors[i]) 
  
  if (!(i %in% c(14,15,17,18))){
  plot(lasso.predictors[,i], resids, xlab= col_name, ylab = "S. Residuals")
  abline(h=0, col="red")
  lines(lowess(lasso.predictors[,i], resids), col='blue')
  }
}

# Checking for constant variance and uncorrelated errors

# Plot of std. residuals versus fitted values.
# Fixed: use lasso.retrained's fitted values -- the original plotted this
# model's residuals against step.model's fitted values (copy-paste slip).
plot(lasso.retrained$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(lasso.retrained$fitted.values, resids), col='blue')
abline(h=0, col="red")


# Plots for normality
hist(resids, col="orange", nclass=15)

qqPlot(resids)
## [1] 667 122
# Residual-analysis verdicts for the retrained lasso model (row 5),
# summarizing the plots above:
# OveralL_GOF = Average
# Linearity = Seems to be holding 
# ConstantVariance =  Does not seem to be clearly holding
# Independence =  Errors are uncorrelated
# Normality = Does not seem to be holding

report_result$OveralL_GOF[5] = "Average"
report_result$Linearity[5] = "Seems to be holding"
report_result$ConstantVariance[5] = "Does not seem to be clearly holding"
report_result$Independence[5] =  "Errors are uncorrelated"
report_result$Normality[5] = "Does not seem to be holding"

lasso.retrained.transformed Model

# Get standardized residuals of the retrained transformed lasso model
resids = rstandard(lasso.retrained.transformed)
par(mfrow=c(2,2))
# Residuals vs each lasso-selected predictor, skipping columns 14,15,17,18
  for (i in c(1:ncol(lasso.predictors))){
  col_name = names(lasso.predictors[i]) 
  
  if (!(i %in% c(14,15,17,18))){
  plot(lasso.predictors[,i], resids, xlab= col_name, ylab = "S. Residuals")
  abline(h=0, col="red")
  lines(lowess(lasso.predictors[,i], resids), col='blue')
  }
}

# Checking for constant variance and uncorrelated errors

# Plot of std. residuals versus fitted values.
# Fixed: use lasso.retrained.transformed's fitted values -- the original
# plotted this model's residuals against step.model's fitted values
# (copy-paste slip).
plot(lasso.retrained.transformed$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(lasso.retrained.transformed$fitted.values, resids), col='blue')
abline(h=0, col="red")


# Plots for normality
hist(resids, col="orange", nclass=15)

qqPlot(resids)
## [1] 122 667
# Residual-analysis verdicts for the transformed lasso model (row 6):
#   Overall GOF       = Average
#   Linearity         = Seems to be holding
#   Constant variance = Seems to be holding
#   Independence      = Errors are uncorrelated
#   Normality         = Improved and moderately holding
# Typo fix in the stored string: "moderaetly" -> "moderately".

report_result$OveralL_GOF[6] = "Average"
report_result$Linearity[6] = "Seems to be holding"
report_result$ConstantVariance[6] = "seem to be holding"
report_result$Independence[6] =  "Errors are uncorrelated"
report_result$Normality[6] = "Improved and moderately holding"

elnet.retrained Model

# Get standardized residuals of the retrained elastic-net model
resids = rstandard(elnet.retrained)
par(mfrow=c(2,2))
# Residuals vs each elnet-selected predictor, skipping columns 14,15,17,18
  for (i in c(1:ncol(elnet.predictors))){
  col_name = names(elnet.predictors[i]) 
  
  if (!(i %in% c(14,15,17,18))){
  plot(elnet.predictors[,i], resids, xlab= col_name, ylab = "S. Residuals")
  abline(h=0, col="red")
  lines(lowess(elnet.predictors[,i], resids), col='blue')
  }
}

# Checking for constant variance and uncorrelated errors

# Plot of std. residuals versus fitted values.
# Fixed: use elnet.retrained's fitted values -- the original plotted this
# model's residuals against step.model's fitted values (copy-paste slip).
plot(elnet.retrained$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(elnet.retrained$fitted.values, resids), col='blue')
abline(h=0, col="red")

# Plots for normality
hist(resids, col="orange", nclass=15)
qqPlot(resids)
## [1] 667 122
# Residual-analysis verdicts for the retrained elastic-net model (row 7),
# summarizing the plots above:
# OveralL_GOF = Average
# Linearity = Seems to be holding 
# ConstantVariance =  Does not seem to be clearly holding
# Independence =  Errors are uncorrelated
# Normality = Does not seem to be holding

report_result$OveralL_GOF[7] = "Average"
report_result$Linearity[7] = "Seems to be holding"
report_result$ConstantVariance[7] = "Does not seem to be clearly holding"
report_result$Independence[7] =  "Errors are uncorrelated"
report_result$Normality[7] = "Does not seem to be holding"

# Display the full comparison table across all seven models
report_result
##                           MSPE       MAE     MAPE         PM R.Squared
## Full                  13025690 1187.4253 48.05262 0.24220980 0.8954624
## Full-Transformed       4621712  939.8901 38.18104 0.08593969 0.6996922
## Step-Wise             12953894 1170.0673 48.53398 0.24087476 0.8949329
## Step-Wise-Transformed  6125621  990.7629 24.20578 0.11390455 0.6586523
## Lasso                 13762029 1165.0353 45.12102 0.25590185 0.8935603
## Lasso-Transformed     12525764 1022.4075 31.18209 0.23291378 0.6749004
## ENet                  14255322 1191.4448 45.07683 0.26507452 0.8937846
##                       Adj.R.Squared number_of_coefficients
## Full                      0.8940788                     19
## Full-Transformed          0.6957176                     19
## Step-Wise                 0.8943965                      8
## Step-Wise-Transformed     0.6569095                      8
## Lasso                     0.8927823                     11
## Lasso-Transformed         0.6725240                     11
## ENet                      0.8929300                     12
##                       number_of_significant_coefficients OveralL_GOF
## Full                                                   4    not good
## Full-Transformed                                      11     Average
## Step-Wise                                              3     Average
## Step-Wise-Transformed                                  6     Average
## Lasso                                                  4     Average
## Lasso-Transformed                                      6     Average
## ENet                                                   6     Average
##                                 Linearity                    ConstantVariance
## Full                  Seems to be holding         Does not seem to be holding
## Full-Transformed      Seems to be holding                  seem to be holding
## Step-Wise             Seems to be holding                 not clearly holding
## Step-Wise-Transformed Seems to be holding                  seem to be holding
## Lasso                 Seems to be holding Does not seem to be clearly holding
## Lasso-Transformed     Seems to be holding                  seem to be holding
## ENet                  Seems to be holding Does not seem to be clearly holding
##                                  Independence                       Normality
## Full                  Errors are uncorrelated     Does not seem to be holding
## Full-Transformed      Errors are uncorrelated Improved and moderaetly holding
## Step-Wise             Errors are uncorrelated    Does not seemd to be holding
## Step-Wise-Transformed Errors are uncorrelated Improved and moderaetly holding
## Lasso                 Errors are uncorrelated     Does not seem to be holding
## Lasso-Transformed     Errors are uncorrelated Improved and moderaetly holding
## ENet                  Errors are uncorrelated     Does not seem to be holding
# Split the consolidated report into three thematic tables
test_predictions_results_comparison <- report_result[1:6]  # metrics + R^2 columns
variable_selection <- report_result[7:8]   # coefficient-count columns
goodness_of_fit <- report_result[9:13]     # residual-analysis verdict columns

# Display the prediction-metric comparison
test_predictions_results_comparison
##                           MSPE       MAE     MAPE         PM R.Squared
## Full                  13025690 1187.4253 48.05262 0.24220980 0.8954624
## Full-Transformed       4621712  939.8901 38.18104 0.08593969 0.6996922
## Step-Wise             12953894 1170.0673 48.53398 0.24087476 0.8949329
## Step-Wise-Transformed  6125621  990.7629 24.20578 0.11390455 0.6586523
## Lasso                 13762029 1165.0353 45.12102 0.25590185 0.8935603
## Lasso-Transformed     12525764 1022.4075 31.18209 0.23291378 0.6749004
## ENet                  14255322 1191.4448 45.07683 0.26507452 0.8937846
##                       Adj.R.Squared
## Full                      0.8940788
## Full-Transformed          0.6957176
## Step-Wise                 0.8943965
## Step-Wise-Transformed     0.6569095
## Lasso                     0.8927823
## Lasso-Transformed         0.6725240
## ENet                      0.8929300
variable_selection
##                       number_of_coefficients number_of_significant_coefficients
## Full                                      19                                  4
## Full-Transformed                          19                                 11
## Step-Wise                                  8                                  3
## Step-Wise-Transformed                      8                                  6
## Lasso                                     11                                  4
## Lasso-Transformed                         11                                  6
## ENet                                      12                                  6
goodness_of_fit
##                       OveralL_GOF           Linearity
## Full                     not good Seems to be holding
## Full-Transformed          Average Seems to be holding
## Step-Wise                 Average Seems to be holding
## Step-Wise-Transformed     Average Seems to be holding
## Lasso                     Average Seems to be holding
## Lasso-Transformed         Average Seems to be holding
## ENet                      Average Seems to be holding
##                                          ConstantVariance
## Full                          Does not seem to be holding
## Full-Transformed                       seem to be holding
## Step-Wise                             not clearly holding
## Step-Wise-Transformed                  seem to be holding
## Lasso                 Does not seem to be clearly holding
## Lasso-Transformed                      seem to be holding
## ENet                  Does not seem to be clearly holding
##                                  Independence                       Normality
## Full                  Errors are uncorrelated     Does not seem to be holding
## Full-Transformed      Errors are uncorrelated Improved and moderaetly holding
## Step-Wise             Errors are uncorrelated    Does not seemd to be holding
## Step-Wise-Transformed Errors are uncorrelated Improved and moderaetly holding
## Lasso                 Errors are uncorrelated     Does not seem to be holding
## Lasso-Transformed     Errors are uncorrelated Improved and moderaetly holding
## ENet                  Errors are uncorrelated     Does not seem to be holding
# Persist the three comparison tables; row names (the model labels) are
# written as the first CSV column by default
write.csv(test_predictions_results_comparison,"output_test_predictions_results_comparison.csv")
write.csv(variable_selection,"output_variable_selection.csv")
write.csv(goodness_of_fit,"output_goodness_of_fit.csv")

Scoring on Next 6 months Data for Future Predictions

future_data <- read.csv("2nd_months_predicting_variables.csv")
# future_data=future_data[-c(1,13)]

# Collapse Country to the two levels used in training ("United Kingdom" vs
# "Others") so the factor matches the model's contrasts.
# Fixed: removed the redundant second as.factor(Country) conversion.
future_data$Country2[future_data$Country != "United Kingdom"] <- "Others"
future_data$Country2[future_data$Country == "United Kingdom"] <- "United Kingdom"
future_data$Country <- as.factor(future_data$Country2)
future_data$Is_Buying_Most_Popular <- as.factor(future_data$Is_Buying_Most_Popular)
# RFM segment = sum of the three quantile scores, mirroring the training prep
future_data$RFMSeg <- as.factor(as.integer(
  future_data$Recency_Quantile + future_data$Frequency_Quantile +
    future_data$Monetory_Value_Quantile))


####### Using the model built to score on future data; squaring undoes the
####### square-root response transformation
future_data_predicted <- predict(full.model.transformed, newdata = future_data,
                                 type = "response")**2

A. Total Revenue and CLV

# Total predicted revenue over the next 6 months across all future customers
revenue=sum(future_data_predicted)
# Gross margin assumption of 1% -- NOTE(review): confirm this rate with the
# business; CLV scales linearly with it
gross=1/100
###churn=15.00/100 ###Assuming 15% Churn

###Assuming no Churn
# Net 6-month customer lifetime value = revenue x gross margin
NET_CLV_6months=revenue*gross

print(revenue)
## [1] 28030947
print(NET_CLV_6months)
## [1] 280309.5

B. Bar Graph

# Creates bin
Groups <- cut(x=as.numeric(future_data_predicted)/1000, breaks=seq(from=0, to=ceiling(15), by = 1))


Bygroup = tapply(future_data_predicted, Groups, length)

####Barplot
b=barplot(height = Bygroup, xlab = "Predicted $ Revenue ( in thousands) ", ylab = "#Customers")

# Absolute customer counts per $1k revenue bin
Bygroup
##   (0,1]   (1,2]   (2,3]   (3,4]   (4,5]   (5,6]   (6,7]   (7,8]   (8,9]  (9,10] 
##    2262     878     218      68      34      26      16       9       6       7 
## (10,11] (11,12] (12,13] (13,14] (14,15] 
##       8       5       3       3       3
# Proportion of customers per bin
Bygroup/sum(Bygroup)
##        (0,1]        (1,2]        (2,3]        (3,4]        (4,5]        (5,6] 
## 0.6379018613 0.2476029329 0.0614777214 0.0191765369 0.0095882685 0.0073322053 
##        (6,7]        (7,8]        (8,9]       (9,10]      (10,11]      (11,12] 
## 0.0045121263 0.0025380711 0.0016920474 0.0019740553 0.0022560632 0.0014100395 
##      (12,13]      (13,14]      (14,15] 
## 0.0008460237 0.0008460237 0.0008460237